library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.6.3
## -- Attaching packages ------------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.0     v purrr   0.3.3
## v tibble  3.0.0     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## Warning: package 'ggplot2' was built under R version 3.6.3
## Warning: package 'tibble' was built under R version 3.6.3
## Warning: package 'tidyr' was built under R version 3.6.3
## Warning: package 'dplyr' was built under R version 3.6.3
## Warning: package 'forcats' was built under R version 3.6.3
## -- Conflicts --------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(lattice)
library(DataExplorer)
## Warning: package 'DataExplorer' was built under R version 3.6.3
# Download the fake job postings dataset from GitHub and parse it as CSV
job_post <- read_csv(
  file = "https://raw.githubusercontent.com/luizmalpele/datasets/master/fake_job_postings.csv"
)
## Parsed with column specification:
## cols(
##   job_id = col_double(),
##   title = col_character(),
##   location = col_character(),
##   department = col_character(),
##   salary_range = col_character(),
##   company_profile = col_character(),
##   description = col_character(),
##   requirements = col_character(),
##   benefits = col_character(),
##   telecommuting = col_double(),
##   has_company_logo = col_double(),
##   has_questions = col_double(),
##   employment_type = col_character(),
##   required_experience = col_character(),
##   required_education = col_character(),
##   industry = col_character(),
##   `function` = col_character(),
##   fraudulent = col_double()
## )
# Print the loaded data to inspect its columns and first rows
job_post

Plot Missing Values

# Plot the missing-value profile of every column in the dataset
job_post %>%
  plot_missing()

Missing Information Comparison

# Compare, per value of `fraudulent`, the share of missing entries in each
# descriptive field. mean(is.na(x)) is the proportion of NA values of x
# within the group (equivalent to sum(is.na(x)) / length(x)).
# No select() is needed first: summarize() keeps only the grouping column
# and the columns created here.
job_post %>%
  group_by(fraudulent) %>%
  summarize(
    na_ratio_salary = mean(is.na(salary_range)),
    na_ratio_department = mean(is.na(department)),
    na_ratio_required_education = mean(is.na(required_education)),
    na_ratio_benefits = mean(is.na(benefits)),
    na_ratio_requirements = mean(is.na(requirements)),
    na_ratio_company_profile = mean(is.na(company_profile)),
    na_ratio_location = mean(is.na(location)),
    na_ratio_employment_type = mean(is.na(employment_type)),
    na_ratio_industry = mean(is.na(industry)),
    # required_experience was selected for this comparison but its ratio was
    # never computed; it is appended last so existing columns keep their place
    na_ratio_required_experience = mean(is.na(required_experience))
  )

The variables that presented a higher missing-information ratio are company_profile and employment_type.

# For each value of `fraudulent`, compute the share of postings that have
# each binary flag set: the sum of the 0/1 flag divided by the group size.
job_post %>%
  group_by(fraudulent) %>%
  summarize(
    ratio_has_questions = sum(has_questions) / n(),
    ratio_has_company_logo = sum(has_company_logo) / n(),
    ratio_telecommuting = sum(telecommuting) / n()
  )

The binary indicators whose ratios stand out between the two groups are has_questions and has_company_logo (these ratios are the share of postings with the flag set, not missing-information ratios). The next step is to investigate the titles and descriptions using data mining and text mining techniques.